load("rdata_file.Rdata")
#update
library(spotifyr)
library(genius)
library(tm)
library(wordcloud)
library(wordcloud2)
library(tidyr)
library(lexicon)
library(tidytext)
library(tidyverse)
library(dplyr)
library(XML)
library(gridExtra)
library(ggplot2)
# Get playlist categories
# NOTE(review): the calls below presumably hit the Spotify Web API through
# spotifyr and need credentials configured beforehand - confirm before running.
# `cats` and the per-category listings (`hiphop`, `pop`, `rock`, `metal`,
# `indie`) are not referenced again in the visible part of this script; the
# playlists themselves are fetched by hard-coded playlist ID.
cats <- get_categories()
# Hiphop category
hiphop <- get_category_playlists("hiphop")
RapCaviar <- get_playlist("37i9dQZF1DX0XUsuxWHRQd")
MostNecessary <- get_playlist("37i9dQZF1DX2RxBh64BHjQ")
SignedXOXO <- get_playlist("37i9dQZF1DX2A29LI7xHn1")
# Pop category (only two playlists, unlike the other genres)
pop <- get_category_playlists("pop")
TodaysTopHits <- get_playlist("37i9dQZF1DXcBWIGoYBM5M")
PopRising <- get_playlist("37i9dQZF1DWUa8ZRTfalHk")
# Rock category
rock <- get_category_playlists("rock")
RockThis <- get_playlist("37i9dQZF1DXcF6B6QPhFDv")
AltNOW <- get_playlist("37i9dQZF1DWVqJMsgEN0F4")
RockHard <- get_playlist("37i9dQZF1DWWJOmJ7nRx0C")
# Metal category
metal <- get_category_playlists("metal")
KickassMetal <- get_playlist("37i9dQZF1DWTcqUzwhNmKv")
NewCore <- get_playlist("37i9dQZF1DWXIcbzpLauPS")
NuMetalGeneration <- get_playlist("37i9dQZF1DXcfZ6moR6J0G")
# Indie category (the category id passed to the API is "indie_alt")
indie <- get_category_playlists("indie_alt")
Alternative10s <- get_playlist("37i9dQZF1DX873GaRGUmPl")
EarlyAlternative <- get_playlist("37i9dQZF1DXdTCdwCKzXwo")
Alternative00s <- get_playlist("37i9dQZF1DX0YKekzl0blG")
# playlist_to_df() takes a Spotify playlist object and returns a data frame
# with the track name, artist name, popularity score, lyrics and audio
# features of every track. It relies on the spotifyr and genius packages.
# Build a per-track data frame for one Spotify playlist.
#
# Args:
#   playlist: a playlist object; the code reads playlist$id and the
#     track.name, track.popularity and track.artists fields of
#     playlist$tracks$items.
#
# Returns:
#   A data frame with one row per track and the columns name, popularity,
#   artist, lyrics, danceability, key, energy, loudness, mode, speechiness,
#   acousticness, instrumentalness, liveness, valence and tempo.
#   lyrics is NA for tracks whose lookup failed. The audio-feature columns
#   are NA when the feature table cannot be aligned row-for-row with the
#   tracks, so every result has the same set of columns.
playlist_to_df <- function(playlist) {
  items <- playlist[["tracks"]][["items"]]

  # Track names with any "(...)" suffix removed, plus the popularity score.
  outputDF <- data.frame(
    name = gsub("\\s*\\([^\\)]+\\)", "", as.character(items[["track.name"]])),
    popularity = items[["track.popularity"]],
    stringsAsFactors = FALSE
  )
  n_tracks <- nrow(outputDF)

  # Every track carries its own table of artists; keep the first one listed.
  outputDF$artist <- vapply(
    seq_len(n_tracks),
    function(i) as.character(items[["track.artists"]][[i]][["name"]][[1]]),
    character(1)
  )

  # The lyrics column must exist before the loop. Assigning into a missing
  # column recycles the first value found into every row, so later failed
  # lookups would silently keep another song's lyrics.
  outputDF$lyrics <- rep(NA_character_, n_tracks)
  for (i in seq_len(n_tracks)) {
    outputDF$lyrics[i] <- tryCatch(
      {
        lyrics_from_genius <- genius_lyrics(
          artist = outputDF$artist[i],
          song = outputDF$name[i],
          info = "simple"
        )
        paste(unlist(lyrics_from_genius$lyric), collapse = " ")
      },
      error = function(e) {
        # A missing song must not abort the whole playlist: report and move on.
        warning(
          sprintf(
            "No lyrics for \"%s\" by \"%s\": %s",
            outputDF$name[i], outputDF$artist[i], conditionMessage(e)
          ),
          call. = FALSE
        )
        NA_character_
      }
    )
  }

  # Audio features. Columns are created up front so a failed alignment still
  # yields a data frame that can be rbind-ed with the other playlists.
  feature_cols <- c(
    "danceability", "key", "energy", "loudness", "mode", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo"
  )
  for (col in feature_cols) {
    outputDF[[col]] <- rep(NA_real_, n_tracks)
  }

  features_df <- get_playlist_audio_features("spotify", playlist[["id"]])
  # NOTE(review): features are matched to tracks by row position, as in the
  # original code - confirm the feature table keeps the playlist order.
  if (is.data.frame(features_df) && nrow(features_df) == n_tracks) {
    for (col in intersect(feature_cols, names(features_df))) {
      outputDF[[col]] <- features_df[[col]]
    }
  } else {
    warning(
      "Audio features do not line up with the playlist tracks; ",
      "feature columns left as NA.",
      call. = FALSE
    )
  }

  outputDF
}
# Playlist objects in genre order: hiphop (1-3), pop (4-5), rock (6-8),
# metal (9-11), indie (12-14). Later code indexes the results by position.
playlistsNames <- list(
  RapCaviar, MostNecessary, SignedXOXO,
  TodaysTopHits, PopRising,
  RockThis, AltNOW, RockHard,
  KickassMetal, NewCore, NuMetalGeneration,
  Alternative10s, EarlyAlternative, Alternative00s
)
# Warning: this step takes ~30min with 16gb of RAM.
# One data frame per playlist, in the same order as playlistsNames.
# The result keeps the name `list` because later code refers to it; note
# that it shadows base::list as a variable (function calls still resolve).
list <- lapply(playlistsNames, playlist_to_df)
# Inspect the first playlist only. The original `str(list)[[1]]` indexed the
# NULL returned by str() and therefore dumped every playlist.
str(list[[1]])
# Clean the lyrics and fill missing numeric values in every playlist.
#
# Args:
#   list: a list of data frames, each with a `lyrics` column and the numeric
#     columns popularity and the audio features (danceability, key, ...).
#
# Returns:
#   The same list with, for every data frame:
#   - lyrics lower-cased and stripped of mentions, URLs, tokens starting
#     with a digit, hashtags, non-ASCII characters and punctuation, with
#     whitespace collapsed to single spaces;
#   - NAs in the numeric columns replaced by the column mean (truncated to
#     an integer for popularity, key and mode).
MrClean <- function(list) {
  integer_cols <- c("popularity", "key", "mode")
  mean_fill_cols <- c(
    integer_cols, "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo"
  )

  for (i in seq_along(list)) {
    playlist_df <- list[[i]]

    if (!is.null(playlist_df$lyrics)) {
      lyrics <- tolower(playlist_df$lyrics)
      # Remove mentions, urls, emojis, numbers, punctuations, etc.
      lyrics <- gsub("@\\w+", "", lyrics)
      # Stop at the next whitespace: each song is a single string, so a
      # greedy ".+" would erase all lyrics following a URL.
      lyrics <- gsub("https?://\\S+", "", lyrics)
      lyrics <- gsub("\\d+\\w*\\d*", "", lyrics)
      lyrics <- gsub("#\\w+", "", lyrics)
      lyrics <- gsub("[^\x01-\x7F]", "", lyrics)
      lyrics <- gsub("[[:punct:]]", "", lyrics)
      # Remove spaces and newlines
      lyrics <- gsub("\n", " ", lyrics)
      lyrics <- gsub("^\\s+", "", lyrics)
      lyrics <- gsub("\\s+$", "", lyrics)
      lyrics <- gsub("[ \t]+", " ", lyrics)
      playlist_df$lyrics <- lyrics
    }

    # Replace NA with the column mean. Only the known numeric columns are
    # averaged; taking mean() of character columns just produced warnings.
    for (col in intersect(mean_fill_cols, names(playlist_df))) {
      values <- playlist_df[[col]]
      if (!is.numeric(values)) {
        next
      }
      fill <- mean(values, na.rm = TRUE)
      if (col %in% integer_cols) {
        fill <- as.integer(fill)
      }
      values[is.na(values)] <- fill
      playlist_df[[col]] <- values
    }

    list[[i]] <- playlist_df
  }

  list
}
# Clean every playlist data frame built above.
clean_list <- MrClean(list)
# Recorded output of the original run: the warning below was emitted 28
# times in a row. It presumably comes from MrClean() calling mean() on every
# column of each data frame, including the character ones.
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
# Inspect the first cleaned playlist.
str(clean_list[[1]])
# Split the lyrics of one playlist/genre into words and drop stop words.
#
# Args:
#   list: despite the name, a single data frame with a `lyrics` column.
#
# Returns:
#   A one-column table (`word`) with one row per remaining token. The result
#   is returned visibly; the original ended in an assignment and so returned
#   it invisibly.
MrClean_Token <- function(list) {
  list %>%
    select(lyrics) %>%
    unnest_tokens("word", "lyrics") %>%
    # Explicit key: avoids the "Joining, by = ..." message on every call.
    anti_join(get_stopwords(), by = "word")
}
# Count NRC sentiment words in the lyrics of one playlist/genre.
#
# Args:
#   clean_list: a single cleaned data frame with a `lyrics` column (the
#     parameter name shadows the global list of the same name).
#
# Returns:
#   A table with the columns word, sentiment and n (number of occurrences of
#   that word/sentiment pair), sorted by descending n. A word appears once
#   per sentiment the lexicon assigns to it.
get_sentiment_data <- function(clean_list) {
  MrClean_Token(clean_list) %>%
    # Stop words are filtered again here, as in the original; harmless if
    # MrClean_Token() has already removed them.
    anti_join(get_stopwords(), by = "word") %>%
    select(word) %>%
    # Explicit key: avoids the "Joining, by = ..." message on every call.
    # NOTE(review): recent dplyr versions may warn about a many-to-many
    # relationship here, which is the intended shape of this join.
    inner_join(get_sentiments("nrc"), by = "word") %>%
    count(word, sentiment) %>%
    arrange(desc(n))
}
# Sentiment word counts for the first cleaned playlist.
playlist_sentiment <- get_sentiment_data(clean_list[[1]])
## Joining, by = "word"
## Joining, by = "word"
str(playlist_sentiment)
## 'data.frame': 1343 obs. of 3 variables:
## $ word : chr "bitch" "bitch" "bitch" "bitch" ...
## $ sentiment: chr "anger" "disgust" "fear" "negative" ...
## $ n : int 378 378 378 378 378 298 298 298 168 168 ...
# Quick look: stacked word counts per sentiment category.
ggplot(
  data = playlist_sentiment,
  mapping = aes(x = sentiment, y = n)
) +
  geom_col()
# Per-playlist data frames, pulled out of clean_list by position.
# Hiphop: playlists 1-3
df1 <- clean_list[[1]]
df2 <- clean_list[[2]]
df3 <- clean_list[[3]]
# Pop: playlists 4-5
df4 <- clean_list[[4]]
df5 <- clean_list[[5]]
# Rock: playlists 6-8
df6 <- clean_list[[6]]
df7 <- clean_list[[7]]
df8 <- clean_list[[8]]
# Metal: playlists 9-11
df9 <- clean_list[[9]]
df10 <- clean_list[[10]]
df11 <- clean_list[[11]]
# Indie: playlists 12-14
df12 <- clean_list[[12]]
df13 <- clean_list[[13]]
df14 <- clean_list[[14]]
# One data frame per genre: the rows of its playlists stacked in order.
Hiphop_s <- rbind(clean_list[[1]], clean_list[[2]], clean_list[[3]])
Pop_s <- rbind(clean_list[[4]], clean_list[[5]])
Rock_s <- rbind(clean_list[[6]], clean_list[[7]], clean_list[[8]])
Metal_s <- rbind(clean_list[[9]], clean_list[[10]], clean_list[[11]])
Indie_s <- rbind(clean_list[[12]], clean_list[[13]], clean_list[[14]])
# Emotion bar chart for one genre.
#
# Args:
#   genre: cleaned data frame with a `lyrics` column.
#   title: plot title.
#
# Returns:
#   A ggplot object: word counts per NRC sentiment, leaving out the two
#   polarity categories ("negative" and "positive"). The x-axis tick labels
#   are blanked; the fill legend identifies the bars.
bar_chart <- function(genre, title) {
  sentiment_counts <- get_sentiment_data(genre)
  is_polarity <- sentiment_counts$sentiment == "negative" |
    sentiment_counts$sentiment == "positive"
  emotion_counts <- subset(sentiment_counts, !is_polarity)

  chart_theme <- theme(
    plot.title = element_text(hjust = 0.5, vjust = 1, face = "bold", size = 25),
    axis.title.x = element_text(size = 16, vjust = -.75),
    axis.title.y = element_text(size = 16),
    legend.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_blank()
  )

  ggplot(emotion_counts, aes(x = sentiment, y = n, fill = sentiment)) +
    geom_col() +
    labs(title = title, x = "Sentiment", y = "Count") +
    guides(fill = guide_legend(title = "Sentiment \nAnalysis")) +
    chart_theme
}
# Polarity bar chart for one genre.
#
# Args:
#   genre: cleaned data frame with a `lyrics` column.
#   title: plot title.
#
# Returns:
#   A ggplot object comparing the word counts of the "negative" and
#   "positive" NRC categories; the legend is hidden.
bar_chart_p <- function(genre, title) {
  sentiment_counts <- get_sentiment_data(genre)
  polarity_counts <- subset(
    sentiment_counts,
    sentiment %in% c("negative", "positive")
  )

  chart_theme <- theme(
    plot.title = element_text(hjust = 0.5, vjust = 1, face = "bold", size = 25),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    legend.position = "none"
  )

  ggplot(polarity_counts, aes(x = sentiment, y = n, fill = sentiment)) +
    geom_col() +
    labs(title = title, x = "Polarity", y = "Count") +
    chart_theme
}
# Word cloud of the sentiment-bearing words of one genre.
#
# Args:
#   genre: cleaned data frame with a `lyrics` column.
#
# Returns:
#   The wordcloud2 widget built from a word/freq table: the sentiment column
#   is dropped, duplicate rows are removed and the second column is renamed
#   to "freq".
word_cloud <- function(genre) {
  word_freq <- get_sentiment_data(genre)
  kept_cols <- setdiff(names(word_freq), "sentiment")
  word_freq <- distinct(word_freq[, kept_cols, drop = FALSE])
  names(word_freq)[2] <- "freq"
  wordcloud2(word_freq)
}
# One word cloud per genre. Each call is a top-level expression, so its
# result is auto-printed.
word_cloud(Hiphop_s)
word_cloud(Indie_s)
word_cloud(Pop_s)
word_cloud(Metal_s)
word_cloud(Rock_s)